Data Mangagement

Skin bleaching interest was extracted from Google Trends as relative search volume (RSV) using the gTrends package. The time period of interest is the beggining of 2008 to the end 2018. Although data is available from 2004 multiple papers suggest that the data is unreliable before 2008.

Analysis

 overall_trend <- gtrends(keyword = "skin bleaching", geo = "US", time = "2008-01-01 2018-12-31", low_search_volume = T, gprop = "web")
 

overall_trend$interest_over_time %>% 
   select(date, hits)  %>% 
 # unite("date", date:day,sep = "-") %>% 
        mutate( yearMonth = yearmonth(date)) %>% 
  select(yearMonth, hits,  -date) %>% 
  as_tsibble(index = yearMonth) %>% 
  gg_subseries(hits) +
  labs(y = "Relative Search Volume", x = "")

overall_trend$interest_over_time %>% 
   select(date, hits)  %>% 
 # unite("date", date:day,sep = "-") %>% 
        mutate( yearMonth = yearmonth(date)) %>% 
  select(yearMonth, hits,  -date) %>% 
  as_tsibble(index = yearMonth) %>% 
  autoplot()+
  labs(y = "Relative Search Volume", x = "")

State wide

#Extracting state wide information

statedf <- c(state.abb, "DC") %>% 
  tibble(states = .) %>% 
  mutate(country = rep("US", 51)) %>% 
  select(country, states) %>% 
  unite("region" , country:states,  sep = "-")
##########################################3
states <- read_csv("./statesTS.csv")

names(states) <- substring(names(states), 8)
#adding regions to data set
states %>% 
  #tibble() %>% 
  #unnest() %>% 
  separate(geo, into = c("country","state")) %>% 
  select(date,hits,state) %>% 
  mutate(yearmonth = yearmonth(date)) %>% 
  select(-date) %>% 
  as_tsibble(index = yearmonth, key = state) %>% 
 filter(state %in% c("NY","CA","TX","NJ")) %>% 
  gg_subseries(hits)

states_q <- states %>% 
  #tibble() %>% 
  #unnest() %>% 
  separate(geo, into = c("country","state")) %>% 
  select(date,hits,state) %>% 
  mutate(yearquater = yearquarter(date)) %>% 
  select(-date) %>% 
  group_by(yearquater, state) %>% 
  summarise(hits = sum(hits)) %>% 
  arrange(state) %>% 
  as_tsibble(index = yearquater, key = state) 

state_region <- tibble(
  name = c(state.name,"DistrictofColumbia"), 
  region = c(as.character(state.region), "South"),   
  state = c(state.abb, "DC")
)

states_q <- left_join(states_q, state_region, by = "state")

states_q %>% 
  mutate(state = factor(state)) %>% 
 filter(state %in% c("NY","CA","TX","NJ")) %>% 
  gg_subseries(hits)

Decomposition

dcmp <- states %>% 
 # tibble() %>% 
  #unnest() %>% 
  separate(geo, into = c("country","state")) %>% 
  select(date,hits,state) %>% 
  mutate(yearmonth = yearmonth(date)) %>% 
  select(-date) %>% 
  as_tsibble(index = yearmonth, key = state) %>% 
 #filter(state %in% c("NY","CA","NJ","TX")) %>% 
  model(stl = STL(hits ~ season(window = 13) + trend(window = 10), robust = T)) %>% 
  components() 

states %>% 
#  tibble() %>% 
#  unnest() %>% 
  separate(geo, into = c("country","state")) %>% 
  select(date,hits,state) %>% 
  mutate(yearmonth = yearmonth(date)) %>% 
  select(-date) %>% 
  as_tsibble(index = yearmonth, key = state) %>% 
# filter(state %in% c("NY","CA","NJ","TX")) %>% 
  autoplot(hits) +
  theme(legend.position = "none") +
  autolayer(dcmp, trend,col = "blue", alpha = 2)

dcmp %>% autoplot()

states_q %>% 
   autoplot(hits) +
  theme(legend.position = "none") +
  autolayer(dcmp, trend,col = "blue", alpha = 2)

dcmp2 <- states_q %>% 
  model(stl = STL(hits ~ season(window = 13) + trend(window = 10), robust = T)) %>% 
  components() 


dcmp2 %>% autoplot() +
  theme(legend.position = "none")

#dcmp %>% gg_subseries(season_year)

#dcmp %>% gg_season(hits, label = "right")

Strongest trend and seasonality

Trend and seasonal strength can help us understand which times series experienced the most change and the amount of variability within during the time period.

`%notin%` <- Negate(`%in%`)

test_s <-  states %>% 
 # tibble() %>% 
 # unnest() %>% 
  separate(geo, into = c("country","state")) %>% 
  select(date,hits,state) %>% 
  mutate(yearmonth = yearmonth(date)) %>% 
  select(-date) %>% 
  as_tsibble(index = yearmonth, key = state) #%>% 
# filter(state %in% c("NY","CA","NJ","TX","MD","DC","NV","GA", "FL","MI"))  
  
#seasonal and trend strength
   
    
  q1 <- states_q %>% #data by quarter
    mutate(region = as_factor(region)) %>% 
  features(hits, feat_stl) %>% 
  ggplot(aes(x = trend_strength, y = seasonal_strength_year)) +
  geom_point(#aes(
  #  col = ifelse(state %in% c("NY","CA","NJ","TX","MD","DC","NV","GA", "FL","MI"), "blue","red"),
  #  col = region)
             ) +
  geom_label_repel(
      aes(label = state ),
        data =  . %>% filter(state  %in% c("NY","CA","NJ","TX","MD","DC","NV","GA", "FL","MI")),
                  box.padding   = 0.35, 
                  point.padding = 0.5,
                  segment.color = 'grey50',
                  size = 3
      ) +
   #   theme(legend.position = "none")+
    scale_fill_viridis_d(aes(col = region))+
    labs(title = "States with Highest Regional SB RSV")
  
  q2 <- states_q %>% #data by quarter
    mutate(region = as_factor(region)) %>% 
  features(hits, feat_stl) %>% 
  ggplot(aes(x = trend_strength, y = seasonal_strength_year)) +
  geom_point(#aes(
  #  col = ifelse(state %in% c("NY","CA","NJ","TX","MD","DC","NV","GA", "FL","MI"), "blue","red"),
  #  col = region)
             ) +
  geom_label_repel(
      aes(label = state ),
        data =  . %>% filter(trend_strength > 0.5 & seasonal_strength_year > 0.28),
                  box.padding   = 0.35, 
                  point.padding = 0.5,
                  segment.color = 'grey50',
                  size = 3
      ) +
   #   theme(legend.position = "none")+
    scale_fill_viridis_d(aes(col = region))+
    labs(title = "Highlights")
      ##################################################
    
  
most_seasonal <- states_q %>% 
      features(hits, feat_stl) %>%
      filter(trend_strength %notin% NaN ) %>% 
      filter(seasonal_strength_year == max(seasonal_strength_year))
    
  t1<-  states_q %>% 
      right_join(most_seasonal, by =c("state")) %>% 
      ggplot(aes(x = yearquater, y = hits)) + geom_line()+
      ggtitle("Texus")
    
most_trend <- states_q %>% 
  features(hits, feat_stl) %>% 
       filter(trend_strength %notin% NaN ) %>% 
  filter(trend_strength == max(trend_strength))

  c1<- states_q %>% 
  right_join(most_trend, by = c("state")) %>% 
  ggplot(aes(x = yearquater, y = hits)) + geom_line() +
     ggtitle("California")
  
  q1 + q2 

  t1 +c1

The plot with the states that have a his SB RSV when utilizing the regional data set from google trends data, now looking closely at their indivifual time series, it suggests states such as NJ, MD, DC, NV may have just always had high SB RSV and are changing as much as the other labeled states who excibit strong trends (increase) and seasonality (predictability) over time. The plot on the right depict a few states with relatively high seasonality and trend which end up all being states that we identified earlier as states with regional RSV above 72 (3rd quartile).

States surrounding FL, GA, MI, NY, CA; though they may not be among those who have high RSV they are interesting still becuase the show similar levels of trend and seasonal strength. This means that they have increased the most…..

ss <- states_q %>% 
  mutate(region = as_factor(region)) %>%
  features(hits, feat_stl) %>% 
  na.omit()
##########################33  merging data set with shp file
state_m <-  
          geo_join(s, #STUSPS
                    ss,
                   by_sp = "STUSPS",
                   by_df = "state",
                   how = "inner"       ) %>% 
  na.omit()

############################### creating legend

legend_tirtile <- function(var1,var2) {
  
  #legend y axis and x axis tirtile intervals
  y = round(
    c(min(var1,na.rm = T), 
      quantile(var1,c(0.333,0.667)), 
      max(var1,na.rm = T)),
    2
  )
  
  z = c(paste0(y[1]," -", "\n",
               "<",y[2]),
        paste0(y[2]," -", "\n",
               "<", y[3]),
        paste0(y[3]," -", "\n",
               y[4]))
  
   yy = round(
    c(min(var2,na.rm = T), 
      quantile(var2,c(0.333,0.667)), 
      max(var2,na.rm = T)),
    2
  )
  
  zz = c(paste0(yy[1]," -", "\n",
               "<",yy[2]),
         paste0(yy[2]," -", "\n",
               "<", yy[3]),
         paste0(yy[3]," -", "\n",
               yy[4]))
  
  #legend data frame
  legend_scale <- data.frame(
  co_var = c(rep(1, 3), rep(2, 3), rep(3, 3)),
  trend.strength = c(rep(seq(1, 3, 1), 3)),
  color = c("#F1F1F1", "#C3DEEE", "#A1D3EA",
            "#F7DBE7", "#CAC8E3", "#A6BDDF",
            "#F7C1CB", "#CAAEC8", "#A6A3C4")
  )
# legend
legend <- ggplot() + 
  geom_tile(
    data = legend_scale,
    aes(x = co_var, y = trend.strength, fill = color)) + 
  scale_fill_identity() +
  
  scale_x_discrete(limits=zz,
      labels=zz)+
  
  scale_y_discrete(limits=z,
        labels=z)+
labs(y =  "Trend Strength →"
   ) +
theme(
    axis.title = element_text(size = 20),
    axis.title.x = element_text( size=30, face = "bold"),
    axis.title.y = element_text( size=30, face = "bold"),
    axis.line = element_blank(),
   # axis.text = element_blank(),
   # axis.ticks = element_blank(),
    panel.grid = element_blank(),
    panel.background = element_blank(),
    plot.margin = margin(10, 10, 10, 10),
    plot.background = element_rect(fill = "transparent", color = NA),
    axis.text.x = element_text(face="bold", color="#000000", size=22),
    axis.text.y = element_text(face="bold", color="#000000", size=22)
  )

legend 
}

#legend plots 

#legend = legend_tirtile(state_m$trend_strength, state_m$seasonal_strength_year) +
  #labs(x = "Seasonal Strength →")  

#ggsave("/Users/stevenlawrence/Desktop/cumc_github/sl4269.github.io/SBL_Global_TS.svg",  plot = legend,  width = 8, height = 6, bg = "transparent")

pal_bivariate <- colorNumeric(c("#F1F1F1", "#C3DEEE", "#A1D3EA",
                          "#F7DBE7", "#CAC8E3", "#A6BDDF",
                          "#F7C1CB", "#CAAEC8", "#A6A3C4"),
            1:9)

bivariate_data<- function(x,y){
            

data <- tibble(yvar = y,
                    xvar = x
                    ) %>% 
              mutate(y.val = case_when(
                  yvar <= yvar %>% quantile(0.333) ~ 3,
                  yvar  > yvar %>% quantile(0.333) & yvar <= yvar %>% quantile(0.667)  ~ 2,
                  yvar > yvar %>% quantile(0.667) ~1
                                      ),
                    x.val= case_when(
                  xvar <= xvar %>% quantile(0.333) ~ 1,
                  xvar  > xvar %>% quantile(0.333) & xvar <= xvar %>% quantile(0.667)  ~ 2,
                  xvar > xvar %>% quantile(0.667) ~3
                                    ),
                    color = case_when(
                      y.val == 1 & x.val == 1 ~ 7,
                       y.val == 1 & x.val == 2 ~ 8,
                       y.val == 1 & x.val == 3 ~ 9,
                       y.val == 2 & x.val == 1 ~ 4,
                       y.val == 2 & x.val == 2 ~ 5,
                       y.val == 2 & x.val == 3 ~ 6,
                       y.val == 3 & x.val == 1 ~ 1,
                       y.val == 3 & x.val == 2 ~ 2,
                       y.val == 3 & x.val == 3 ~ 3,
                    )
                    )
data$color

}
 

labels <- 
            paste0(
                "State: ",
                state_m$NAME, "<br/>",
                "Regioan: ",
                state_m$REGION, "<br/>",
                "Trend Strength: ",
                state_m$trend_strength, "<br/>",
                "Seasonal Strength: ", 
                state_m$seasonal_strength_year) %>%
            lapply(htmltools::HTML)

state_m %>% 
  leaflet(
  width = "100%",
          options = leafletOptions(zoomControl = FALSE)) %>% 
  addProviderTiles("Stadia.AlidadeSmooth") %>% 
  addPolygons(fillColor = state_m$trend_strength %>% 
                bivariate_data(state_m$seasonal_strength_year) %>% pal_bivariate(),
              fillOpacity = 1,
              stroke = FALSE,
              smoothFactor = 0) %>% 
        
        addPolygons(
          label = lapply(labels, htmltools::HTML),
          labelOptions = labelOptions(textsize = "12px"),
          fillColor = NA,
          fillOpacity = 0,
          color = "gray",
          weight = 1,
          opacity = 1,
          highlightOptions = highlightOptions(weight = 2)) %>% 
        
   #     addResetMapButton() %>% 
        
   #     addFullscreenControl() %>% 
        
   #     suspendScroll(sleepNote = F, sleepOpacity = 1) %>% 
        
        addControl(
            
          html = "<img src = 'https://sl4269.github.io/SBL_Global_TS.svg' width = '200' height = '200'>",
          position = "bottomright",
          className = "legend-bivar"
        )

A closer look Seasonal features

#peak hit for each region

                             
test_features <- states_q %>% 
  features(hits, feature_set(pkgs = "feasts"))

# states that don't have information
states_nan <- test_features  %>% 
  filter(is.nan(trend_strength)) %>% 
  select(state) 

# making a data frame that includes regions
#marking problematic states with constant zeroes

test_feature_r <- left_join(test_features, state_region, by= "state" ) %>%
  filter(trend_strength %notin% NaN)

test_feature_r %>%
  select_at(vars(contains("season"), region)) %>%
  mutate(
    seasonal_peak_year = glue::glue("Q{seasonal_peak_year+1}"),
    seasonal_trough_year = glue::glue("Q{seasonal_trough_year+1}"),
  ) %>%
  GGally::ggpairs(mapping = aes(colour=region))

Here I will describe a few plots from the above collection. (i) In the top right corner we observe that states in the North East have high seasonal strength compared to the other states with two states from the south and west regions standing out as outliers. (ii) The bottom plot of the seaonal peak and seasonal trough confirms eallier findings of peak seasonas for all states are in the summer and lowest in the winter.

States in the north east, exibit stronger seasonality.

Using PCA to study time series comparability

Times series have lots of features such as: trend strength, seasonal strength, linearity, spikiness, curvature, and auto correlation, which all tell a unique story of the sereies. PCA helps us to reduce the dimensions of the data to perhaps highlight a new persective of the data.

#principal component analysis
pcs <- test_feature_r %>% 
 select(-state, -region, -name ) %>% 
  filter(!is.nan(trend_strength)) %>% 
  prcomp(scale=F) %>% 
  broom::augment(test_feature_r) 

customGreen0 = "#DeF7E9"

customGreen = "#71CA97"

customRed = "#ff7f7f"

head(pcs) %>% 
  select(state, trend_strength, seasonal_strength_year, seasonal_peak_year) %>% 
  formattable(align= c("l","c","c","c"),
              list(`state` = formatter(
              "span", style = ~ style(color = "grey",font.weight = "bold")),
              `trend_strength` = color_bar(customGreen0),
              `seasonal_strength_year` = color_bar(customGreen0))
              
  )
state trend_strength seasonal_strength_year seasonal_peak_year
AL 0.3210152 0.05351986 2
AR 0.2111337 0.24063737 3
AZ 0.2508358 0.30104321 2
CA 0.8173091 0.55666069 2
CO 0.2005777 0.27587114 2
CT 0.3305664 0.13416575 3

volcano plot

In these two plots we observe how relatable are the SB time series by state (left plot) and region (right plot).

# How related are the time series

s1 <- pcs %>% 
  mutate(state = as_factor(state)) %>% 
  ggplot( aes(x = .fittedPC1,
             y = .fittedPC2, 
             group = ifelse(state %in% c("NY","CA","NJ","TX","MD","DC","NV","GA", "FL","MI"),"red","black" ),
             col = region
             )
    ) +
   geom_point(aes(label = state #,
                  #col = ifelse(state %in% c("NY","CA","NJ","TX","MD","DC","NV","GA", "FL","MI"),"red","black" )
                  )
                  )+ 
   geom_label_repel(
     data =  . %>% filter(state  %in% c("NY","CA","NJ","TX","MD","DC","NV","GA", "FL","MI")),
                  aes(label = state),
                  box.padding   = 0.5, 
                  point.padding = 0.5,
                  segment.color = 'grey50',
                  size = 2) +
  theme(legend.position = "none", aspect.ratio = 1)

r1 <- pcs %>% 
  ggplot(aes(x = .fittedPC1, y =.fittedPC2, col = region)) +
  geom_point() + 
  geom_label_repel(aes(label = state),
                   data =  . %>% filter(.fittedPC1 > 4000 | .fittedPC2 < -500),
                  box.padding   = 0.35, 
                  point.padding = 0.5,
                  segment.color = 'grey50',
                  size = 2) +
theme(legend.position = "right",
      aspect.ratio = 1)


s1 + r1 

PC1 and PC2 explain most amount of variability among all the features in these time series plots. Here we observe a cluster of states that become slightly less related, then 3 outliers that are drastically different than the others. The cluster of states suggest have the most reliable information and suggests that states with highest similarities are most likely to have similar trends and seasonality or even peak at the same times. The outliers include the states Rhode Island, Wisconsin, and Arizona which will be further analyized.

Time Series Ouliers

Taking a look at the three outliers observed above. The resolution of the data is summed by quarter there I devided by the max summed RSV for each quarter to maintain proportions to normal RSV.

# extracting outliers
outliers <- pcs %>%
  filter(.fittedPC1 > 4000 | .fittedPC2 < -500)

#ploting outliers
outliers %>% left_join(states_q, by = c("state","region")) %>% 
  mutate(series = glue("{state}","{region}", .sep = "\n")) %>% 
  ggplot(aes(x = yearquater, y = hits/max(hits)))+geom_line()+
  facet_grid(series ~.)+
  ggtitle("Outlying time series in PC space")

From these plots we can observe a few things: (i) Arizona’s time series obviously has a strong influential point in the 3 quarter of the year 2009 which can be highly attributed to the rise in iterest of Michael Jackson’s death, being a celebrity with his known status of having Viteligo which is highly associated with skin bleaching. (ii) Rhode Islands circuit breaker “on and off” type of series suggests that SBL RSV is not reliably meeting the threshold of forseable interest and lacks seasonality as most other states do. However, it starts to exibit seasonality at the far end after 2016 which suggest the SBL RSV in Rhode Island is more reliable after that date. (iii) Lastly, Wisconsins SBL RSV becomes more reliable after 2010 and I believe is so differenct because of its lack of seasonality for a term that is general refered to as a seasonal behavior in the literature. The more seasonal, the more reliable the data is becuase it increase the feasability of prediction.